### Replication Material ###
### The Manifesto Corpus ### 
### Machine Learning - Figure 4 ###

library(manifestoR)
library(RTextTools)
library(ggplot2)

## set api key
mp_setapikey(key.file = "manifesto_apikey.txt")

# specify corpus version
mp_use_corpus_version("2016-1")


### FUNCTIONS
# define the tm transformations: cleaning on ManifestoDocuments

# Drop unusable rows from a document and strip slashes from its text.
#   doc             : document supporting content(), codes() and subset()
#   codes.to.delete : codes whose rows are discarded (default: none)
# A row is discarded when its text is NA or empty, when its code is NA, or
# when its code is listed in codes.to.delete.
# Returns the filtered document with every "/" removed from the content.
clean.document <- function(doc, codes.to.delete=c()) {
   txt <- content(doc)
   cmp <- codes(doc)
   drop.rows <- is.na(txt) | is.na(cmp) | txt == "" | (cmp %in% codes.to.delete)
   doc <- subset(doc, !drop.rows)
   content(doc) <- gsub("/", "", content(doc))
   doc
}

# tolower() wrapped with content_transformer() so it can be applied to a
# document rather than to a bare character vector
cToLower <- content_transformer(tolower)

# Text normalisation for a single document. Steps, in this order:
# remove punctuation, lower-case, remove German stop words, remove numbers,
# collapse whitespace, stem with the German stemmer.
clean.document2 <- function(x) {
   x <- removePunctuation(x)
   x <- cToLower(x)
   x <- removeWords(x, stopwords("german"))
   x <- removeNumbers(x)
   x <- stripWhitespace(x)
   stemDocument(x, language="german")
}

# Run both cleaning passes over every document of a corpus.
#   corpus : corpus to clean
#   ...    : forwarded through tm_map() to clean.document()
#            (e.g. codes.to.delete)
# Returns the corpus after clean.document() followed by clean.document2().
clean.corpus <- function(corpus, ...) {
   filtered <- tm_map(corpus, clean.document, ...)
   normalised <- tm_map(filtered, clean.document2)
   return(normalised)
}

## apply sliding window filter
# Replaces each sentence that has three neighbours on both sides with a
# weighted bag of itself and those neighbours: the sentence itself 6 times,
# the sentences at distance 1 and 2 twice each, the sentences at distance 3
# once. The first and last three sentences are left untouched.
#   doc : document supporting content() and content<-()
# Returns the document with the windowed text as its content. Documents with
# fewer than 7 sentences are returned unchanged.
sliding.window <- function(doc) {
   ct <- content(doc)
   n <- length(ct)
   # With fewer than 7 sentences no sentence has a complete window. Without
   # this guard 4:(n - 3) counts downwards and indexes outside the document,
   # pasting "NA" into the text or growing the content vector.
   if (n < 7) {
      return(doc)
   }
   # Neighbour offsets and how often each one is repeated; the order fixes
   # the order of the pasted text.
   offsets <- c(0, -3, -2, -1, 3, 2, 1)
   weights <- c(6, 1, 2, 2, 1, 2, 2)
   ctnew <- ct
   for (i in 4:(n - 3)) {
      ctnew[i] <- paste(rep(ct[i + offsets], times = weights), collapse = " ")
   }
   content(doc) <- ctnew
   doc
}


### Script PART

# Training corpus: every country-41 document with 199800 < date < 201000
# for which annotations are available.
# dplyr is called with an explicit namespace because this script never
# attaches it; an unqualified filter() would resolve to stats::filter().
train.availability <- mp_availability(country==41 & (date > 199800 & date < 201000))$availability
train.corpus <- mp_corpus(dplyr::filter(train.availability, annotations==TRUE))

# Test corpus: country-41 documents dated 201309 with party id below 41900
test.corpus <- mp_corpus(country==41 & date == 201309 & (party<41900))

# Row filtering and text normalisation
train.corpus.cleaned <- clean.corpus(train.corpus)
test.corpus.cleaned <- clean.corpus(test.corpus)

# Enrich each sentence with its neighbours
train.corpus.cleaned <- tm_map(train.corpus.cleaned, sliding.window)
test.corpus.cleaned <- tm_map(test.corpus.cleaned, sliding.window)

# One row per sentence, document meta data attached as columns
traindata.cleaned <- as.data.frame(train.corpus.cleaned,  with.meta=TRUE)
testdata.cleaned <- as.data.frame(test.corpus.cleaned, with.meta=TRUE)

# Test rows, flagged train = 0
classdata.test <- testdata.cleaned
classdata.test[["train"]] <- 0

# Training rows, flagged train = 1, without the segment column
classdata.train <- traindata.cleaned
classdata.train[["train"]] <- 1
classdata.train[["segment"]] <- NULL

# Training rows first, test rows after; the index ranges below rely on it
classdata <- rbind(classdata.train, classdata.test)

# Append the party id to the text: a human coder would know the party too
classdata$text2 <- paste(classdata$text, classdata$party, sep=" ")

# Unigram, stemmed, tf-idf weighted document-term matrix
matrix <- create_matrix(classdata$text2,
                        weighting=tm::weightTfIdf,
                        stemWords=TRUE,
                        ngramLength=1)

nrow(classdata.test)

# virgin=FALSE: the test rows carry cmp_code labels as well
container <- create_container(matrix, classdata$cmp_code,
                virgin=FALSE,
                trainSize=seq_len(nrow(classdata.train)),
                testSize=nrow(classdata.train) + seq_len(nrow(classdata.test)))

models <- train_models(container, algorithms="SVM")
results <- classify_models(container, models)

# Put the SVM output next to the test rows it was predicted for
combined <- cbind(results, classdata.test)

# Use one representation of the party id for both tables so that the join
# keys agree whatever type the party column has.
party.id <- as.character(combined$party)

# Code-by-party counts: classifier labels (freqc) and human codes (freqh)
codes_c <- data.frame(table(combined$SVM_LABEL, party.id), stringsAsFactors = FALSE)
codes_h <- data.frame(table(combined$cmp_code, party.id), stringsAsFactors = FALSE)

names(codes_c) <- c("code","party","freqc")
names(codes_h) <- c("code","party","freqh")

# dplyr is not attached by this script, hence the explicit namespace
dfall <- dplyr::full_join(codes_c,codes_h,by=c("party","code"))

# A code/party pair missing from one of the two tables was assigned zero
# times on that side. Both columns need the fill: an NA left in either one
# makes cor() below return NA.
dfall$freqc[is.na(dfall$freqc)] <- 0
dfall$freqh[is.na(dfall$freqh)] <- 0

dfall$freqc <- as.numeric(dfall$freqc)
dfall$freqh <- as.numeric(dfall$freqh)

cor(dfall$freqc,dfall$freqh)
plot(dfall$freqc,dfall$freqh)
plot(log(dfall$freqc),log(dfall$freqh))

# Sentence-level agreement; compared as character so that a factor label
# and a character code are matched on their text
table(as.character(combined$SVM_LABEL)==as.character(combined$cmp_code))

auto.scatter <- ggplot(dfall,aes(freqh,freqc)) +
   geom_point() +
   scale_x_log10() + scale_y_log10() +
   xlab("# human-annotated (log-scaled)") +
   ylab("# computer-annotated (log-scaled)")
print(auto.scatter)

# Full argument name and explicit plot object instead of partial matching
# of `file` and an implicit last_plot()
ggsave(filename="auto-scatter.pdf", plot=auto.scatter, width=3.5, height=3.5)
